In this kernel we examine house prices in King County, USA, and how they vary with location, living space, number of bedrooms, and other attributes. We explore the data and look for patterns. The kernel includes a map showing the distribution of the houses and their prices, and we build a model that predicts house prices.
libraries
library(caTools)
library(plotly)
library(randomcoloR)
library(tidyverse)
library(psych)
library(ggpubr)
library(wesanderson)
library(backports)
library(lubridate)
library(ggplot2)
library(moonBook)
library(mycor)
library(ggcorrplot)
library(scales)
library(leaflet)
library(wesanderson)
library(randomForest)
library(caret)
library(infer)
Columns of Data
1-id = Identify number
2-price = House price in dollar
3-bedrooms = Count of bedrooms
4-bathrooms = Count of bathrooms
5-sqft_living = Living space
6-sqft_lot = Square footage of house on land
7-floors= Count of floors
8-waterfront = House on the seaside or not (1/0)
9-view = View point of house (0 - 4)
10-condition = Conditions point of house (0 - 5)
11-grade = Point of house (1 - 13)
12-sqft_above = Square footage of the above ground
13-sqft_basement = Square footage of the below ground
14-yr_built = The year the house was built
15-yr_renovated = The year the house was renovated
16-zipcode = Zipcode of house
17-lat = Latitude
18-long = Longitude
19-sqft_living15 = Living space in houses,they were sold in 2015
20-sal_year = The year house was sold
21-sqft_lot15 = Square footage of house on land,the houses sold in 2015
## id date price bedrooms bathrooms sqft_living sqft_lot
## 1 7129300520 20141013T000000 221900 3 1.00 1180 5650
## 2 6414100192 20141209T000000 538000 3 2.25 2570 7242
## 3 5631500400 20150225T000000 180000 2 1.00 770 10000
## 4 2487200875 20141209T000000 604000 4 3.00 1960 5000
## 5 1954400510 20150218T000000 510000 3 2.00 1680 8080
## 6 7237550310 20140512T000000 1225000 4 4.50 5420 101930
## floors waterfront view condition grade sqft_above sqft_basement yr_built
## 1 1 0 0 3 7 1180 0 1955
## 2 2 0 0 3 7 2170 400 1951
## 3 1 0 0 3 6 770 0 1933
## 4 1 0 0 5 7 1050 910 1965
## 5 1 0 0 3 8 1680 0 1987
## 6 1 0 0 3 11 3890 1530 2001
## yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 1 0 98178 47.5112 -122.257 1340 5650
## 2 1991 98125 47.7210 -122.319 1690 7639
## 3 0 98028 47.7379 -122.233 2720 8062
## 4 0 98136 47.5208 -122.393 1360 5000
## 5 0 98074 47.6168 -122.045 1800 7503
## 6 0 98053 47.6561 -122.005 4760 101930
## 'data.frame': 21613 obs. of 21 variables:
## $ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ date : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
## vars n mean sd median trimmed
## id 1 21613 4580301520.86 2.876566e+09 3.90493e+09 4500014357.18
## date* 2 21613 178.30 1.095000e+02 1.68000e+02 176.64
## price 3 21613 540088.14 3.671272e+05 4.50000e+05 481704.02
## bedrooms 4 21613 3.37 9.300000e-01 3.00000e+00 3.34
## bathrooms 5 21613 2.11 7.700000e-01 2.25000e+00 2.07
## sqft_living 6 21613 2079.90 9.184400e+02 1.91000e+03 1984.40
## sqft_lot 7 21613 15106.97 4.142051e+04 7.61800e+03 8259.53
## floors 8 21613 1.49 5.400000e-01 1.50000e+00 1.45
## waterfront 9 21613 0.01 9.000000e-02 0.00000e+00 0.00
## view 10 21613 0.23 7.700000e-01 0.00000e+00 0.00
## condition 11 21613 3.41 6.500000e-01 3.00000e+00 3.30
## grade 12 21613 7.66 1.180000e+00 7.00000e+00 7.58
## sqft_above 13 21613 1788.39 8.280900e+02 1.56000e+03 1682.94
## sqft_basement 14 21613 291.51 4.425800e+02 0.00000e+00 205.25
## yr_built 15 21613 1971.01 2.937000e+01 1.97500e+03 1973.10
## yr_renovated 16 21613 84.40 4.016800e+02 0.00000e+00 0.00
## zipcode 17 21613 98077.94 5.351000e+01 9.80650e+04 98074.72
## lat 18 21613 47.56 1.400000e-01 4.75700e+01 47.57
## long 19 21613 -122.21 1.400000e-01 -1.22230e+02 -122.23
## sqft_living15 20 21613 1986.55 6.853900e+02 1.84000e+03 1914.07
## sqft_lot15 21 21613 12768.46 2.730418e+04 7.62000e+03 7903.21
## mad min max range skew kurtosis
## id 3.561991e+09 1000102.00 9900000190.00 9.899000e+09 0.24 -1.26
## date* 1.438100e+02 1.00 372.00 3.710000e+02 0.15 -1.27
## price 2.223900e+05 75000.00 7700000.00 7.625000e+06 4.02 34.57
## bedrooms 1.480000e+00 0.00 33.00 3.300000e+01 1.97 49.05
## bathrooms 7.400000e-01 0.00 8.00 8.000000e+00 0.51 1.28
## sqft_living 8.006000e+02 290.00 13540.00 1.325000e+04 1.47 5.24
## sqft_lot 3.881450e+03 520.00 1651359.00 1.650839e+06 13.06 284.98
## floors 7.400000e-01 1.00 3.50 2.500000e+00 0.62 -0.49
## waterfront 0.000000e+00 0.00 1.00 1.000000e+00 11.38 127.59
## view 0.000000e+00 0.00 4.00 4.000000e+00 3.40 10.89
## condition 0.000000e+00 1.00 5.00 4.000000e+00 1.03 0.53
## grade 1.480000e+00 1.00 13.00 1.200000e+01 0.77 1.19
## sqft_above 6.671700e+02 290.00 9410.00 9.120000e+03 1.45 3.40
## sqft_basement 0.000000e+00 0.00 4820.00 4.820000e+03 1.58 2.71
## yr_built 3.410000e+01 1900.00 2015.00 1.150000e+02 -0.47 -0.66
## yr_renovated 0.000000e+00 0.00 2015.00 2.015000e+03 4.55 18.69
## zipcode 6.227000e+01 98001.00 98199.00 1.980000e+02 0.41 -0.85
## lat 1.600000e-01 47.16 47.78 6.200000e-01 -0.49 -0.68
## long 1.500000e-01 -122.52 -121.32 1.200000e+00 0.88 1.05
## sqft_living15 6.078700e+02 399.00 6210.00 5.811000e+03 1.11 1.60
## sqft_lot15 3.713910e+03 651.00 871200.00 8.705490e+05 9.51 150.71
## se
## id 19566662.38
## date* 0.74
## price 2497.23
## bedrooms 0.01
## bathrooms 0.01
## sqft_living 6.25
## sqft_lot 281.75
## floors 0.00
## waterfront 0.00
## view 0.01
## condition 0.00
## grade 0.01
## sqft_above 5.63
## sqft_basement 3.01
## yr_built 0.20
## yr_renovated 2.73
## zipcode 0.36
## lat 0.00
## long 0.00
## sqft_living15 4.66
## sqft_lot15 185.73
We can remove the month and day from the sale date, because I think they are not important for this analysis; only the year is kept.
remove unwanted columns
# Distribution of sale prices over the full price range (60 bins).
gghistogram(
  houses$price,
  fill = "skyblue",
  bins = 60,
  title = " Price Distrbuation"
) +
  scale_x_continuous(labels = label_number_si())

# Same distribution with 150 bins, zoomed in on prices up to one million.
# Only one x scale is added: the original stacked a comma-labelled scale and a
# dollar-labelled scale, and ggplot2 keeps only the last scale added for an
# aesthetic, so the comma scale never took effect.
# `interactive = TRUE` was dropped because it is not a gghistogram() argument,
# and `xlim` is spelled out instead of relying on partial matching of `x`.
# coord_cartesian() zooms the view without discarding any observations.
gghistogram(
  houses$price,
  fill = "skyblue",
  bins = 150,
  title = " Price Distrbuation < 1M"
) +
  scale_x_continuous(labels = scales::dollar) +
  coord_cartesian(xlim = c(0, 1000000))
Bedrooms
It is obvious that more bedrooms means higher price but this plot show that 11 bedrooms price less than 7 ,that because the effect of other variables.
# From here on the bedroom count is a categorical variable.
houses$bedrooms <- factor(houses$bedrooms)

# Average sale price per bedroom count, leaving out the "30" and "33" levels,
# drawn as one bar per bedroom count with dollar-formatted y axis labels.
houses %>%
  filter(bedrooms != "30" & bedrooms != "33") %>%
  group_by(bedrooms) %>%
  summarise(mean = mean(price)) %>%
  ggbarplot(
    x = "bedrooms",
    y = "mean",
    fill = "bedrooms",
    palette = "Set3",
    size = 1.5
  ) +
  labs(title = " Price Average By Bedrooms") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = scales::dollar, n.breaks = 15)
Grades
# From here on the grade is a categorical variable.
houses$grade <- factor(houses$grade)

# Average sale price per grade, one bar per grade.
# The summarised table arrives through the pipe as ggbarplot()'s `data`
# argument. The original also passed `houses` positionally, which R matched to
# the next free formal of ggbarplot() rather than to `data`; that stray
# argument is removed.
# NOTE(review): randomColor() draws a new palette on every run, so bar colours
# are not reproducible between renders.
houses %>%
  group_by(grade) %>%
  summarise(mean = mean(price)) %>%
  ggbarplot(
    x = "grade",
    y = "mean",
    fill = "grade",
    palette = randomColor(count = 12, luminosity = "light"),
    size = 1.5
  ) +
  scale_y_continuous(labels = scales::dollar, n.breaks = 15) +
  theme(legend.position = "none") +
  labs(title = "Price Average by grade")
View
# From here on the view score is a categorical variable.
houses$view <- factor(houses$view)

# Average sale price per view score, one bar per score, coloured with the
# "Darjeeling1" Wes Anderson palette and dollar-formatted y axis labels.
houses %>%
  group_by(view) %>%
  summarise(mean = mean(price)) %>%
  ggbarplot(
    x = "view",
    y = "mean",
    fill = "view",
    palette = wes_palette("Darjeeling1"),
    size = 1.5
  ) +
  labs(title = " Price Average By View") +
  scale_y_continuous(labels = dollar, n.breaks = 15) +
  theme(legend.position = "none")
Floors
# Round the floor count to a whole number and treat it as a category.
# Note that round() sends exact halves to the nearest even integer.
houses$floors <- factor(round(houses$floors))

# Average sale price per (rounded) floor count, one bar per count.
# The summarised table arrives through the pipe as ggbarplot()'s `data`
# argument. The original also passed `houses` positionally, which R matched to
# the next free formal of ggbarplot() rather than to `data`; that stray
# argument is removed.
houses %>%
  group_by(floors) %>%
  summarise(mean = mean(price)) %>%
  ggbarplot(
    x = "floors",
    y = "mean",
    fill = "floors",
    palette = wes_palette("BottleRocket2"),
    size = 1.5
  ) +
  scale_y_continuous(n.breaks = 10, labels = label_dollar()) +
  labs(title = "Price Average By Floors Number") +
  theme(legend.position = "none")
living Space
# Sale price against living area, with a fitted linear trend line.
# Changes from the original one-liner:
#   * geom_smooth() received `size` twice (1.2 and 2); a single value is kept.
#     NOTE(review): 1.2 is assumed to be the intended width - confirm.
#   * ylim(70000, 2000000) was removed: the later scale_y_continuous() replaces
#     any earlier y scale, so those limits never applied.
#   * coord_cartesian() now names `ylim` instead of relying on partial
#     matching of `y`.
# xlim() sets scale limits, so houses above 5000 sqft are dropped before the
# trend line is fitted, whereas coord_cartesian() only zooms the y axis.
ggscatter(
  houses,
  "sqft_living",
  "price",
  color = "purple",
  size = 0.5,
  alpha = 0.5
) +
  geom_smooth(method = "lm", size = 1.2, col = "darkred") +
  xlim(0, 5000) +
  labs(
    x = 'Tooal living space',
    y = 'Price (USD)',
    title = "Price By living Space"
  ) +
  scale_y_continuous(labels = label_dollar(), n.breaks = 10) +
  coord_cartesian(ylim = c(0, 2000000))
Year Solid
## # A tibble: 2 x 2
## date mean
## <fct> <dbl>
## 1 2014 539181.
## 2 2015 541989.
Year Built
# Bucket the construction year into three eras.
# include.lowest = TRUE closes the first interval on the left, so houses built
# in 1900 (the minimum year in the data) fall into "1900-1950" instead of
# becoming NA and being dropped from the plot. The labels are set directly in
# cut() rather than by matching the formatted interval strings afterwards.
houses$yearb <- cut(
  houses$yr_built,
  breaks = c(1900, 1950, 2000, 2020),
  labels = c("1900-1950", "1950-2000", "2000-2020"),
  include.lowest = TRUE
)

# Average sale price per construction era, one bar per era.
houses %>%
  group_by(yearb) %>%
  summarise(mean = mean(price, na.rm = TRUE)) %>%
  filter(!is.na(yearb)) %>%
  ggbarplot("yearb", "mean", fill = "yearb", color = "black", size = 2) +
  labs(
    title = "Price Average By Year Built",
    x = "Year Built",
    y = "Praice Average"
  ) +
  scale_y_continuous(labels = dollar, n.breaks = 10) +
  theme(legend.position = "none")

# Bucket sale prices into six bands for the map legend. The default interval
# labels are kept here on purpose: the statement that follows relabels the
# factor by matching those exact strings.
houses$PriceBin <- cut(
  houses$price,
  c(0, 250000, 500000, 750000, 1000000, 2000000, 99900000)
)
# Replace the interval labels produced by cut() with readable price bands.
houses$PriceBin <- factor(
  houses$PriceBin,
  levels = c(
    "(0,2.5e+05]", "(2.5e+05,5e+05]", "(5e+05,7.5e+05]",
    "(7.5e+05,1e+06]", "(1e+06,2e+06]", "(2e+06,9.99e+07]"
  ),
  labels = c(
    "0 - 250K $", "250K $ - 500K $", "500K $ - 750K $",
    "750K $ - 1M $", "1M $ - 2M $", "2M $ - 10M $"
  )
)

# Centre the initial map view on the median house location.
center_lon <- median(houses$long, na.rm = TRUE)
center_lat <- median(houses$lat, na.rm = TRUE)

# One colour per price band.
factpal <- colorFactor(
  c("red", "blue", "yellow", "orange", "#0B5345", "black"),
  houses$PriceBin
)

# Hover label for each house. The original format string had only two `%s`
# placeholders for four arguments, so the bedroom count was never rendered;
# the captions now live in the format and every value has a placeholder.
# as.character() makes sure the factor level of `bedrooms` is what is shown.
labels <- sprintf(
  "<strong>Price: </strong>%s<br/><strong>Bedrooms: </strong>%s",
  dollar(houses$price),
  as.character(houses$bedrooms)
) %>%
  lapply(htmltools::HTML)

# Interactive map: one circle per house coloured by price band, with a hover
# label, a click popup (bedrooms and living space) and a legend.
leaflet(houses) %>%
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(
    lng = ~long,
    lat = ~lat,
    color = ~factpal(PriceBin),
    label = labels,
    popup = paste0(
      'Bedrooms: ', houses$bedrooms,
      ', Living Space: ', houses$sqft_living
    )
  ) %>%
  setView(lng = center_lon, lat = center_lat, zoom = 12) %>%
  addLegend(
    "bottomright",
    pal = factpal,
    values = ~PriceBin,
    title = "House Price Distribution",
    opacity = 1
  )
First we need to remove some variables
# Read a fresh copy of the data for modelling and drop the columns that are
# not used as predictors (id, date, yr_renovated, zipcode).
houses1 <- read.csv(
  "E:/test files/kc_house_data.csv",
  stringsAsFactors = TRUE
) %>%
  dplyr::select(-id, -date, -yr_renovated, -zipcode)
Now split our data into train and test data
# Random split of the rows with SplitRatio = 0.75: rows flagged TRUE go to the
# training set, the remaining rows to the test set.
sample <- sample.split(houses1$price, SplitRatio = 0.75)
train <- subset(houses1, sample)
test <- subset(houses1, !sample)
Create the model
##
## Call:
## randomForest(formula = price ~ ., data = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 19006646713
## % Var explained: 87.03
We see that the model explains about 87% of the variance in price, which is good.
Test the model
## RMSE Rsquared MAE
## 1.059046e+05 8.771086e-01 6.463886e+04
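On the test data the model reaches an R-squared of about 0.88, i.e. it explains roughly 87-88% of the variance in price (R-squared is a goodness-of-fit measure, not a classification accuracy).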
We made some changes to the data to try to improve the model: for example, we converted the year built from a numeric value into a factor, converted the basement area into a binary (has basement / no basement) indicator, rounded the bathroom and floor counts, and resampled the training data.
# Feature engineering for the second model: bucket the construction year,
# round the bathroom and floor counts to whole numbers, and replace the
# basement area with a 0/1 indicator.
# The year buckets are computed from houses1 itself (the original read
# houses$yr_built, which is only correct while both data frames have the same
# rows in the same order). include.lowest = TRUE keeps houses built in 1900 in
# the first bucket instead of turning them into NA, which the filter below
# would then silently discard.
houses1$yearb <- cut(
  houses1$yr_built,
  breaks = c(1900, 1950, 2000, 2020),
  labels = c("1900-1950", "1950-2000", "2000-2020"),
  include.lowest = TRUE
)
houses1 <- houses1 %>% dplyr::select(-yr_built)
houses1$bathrooms <- round(houses1$bathrooms)
houses1$floors <- round(houses1$floors)
houses1 <- houses1 %>% filter(!is.na(yearb))
houses1$basement <- ifelse(houses1$sqft_basement > 0, 1, 0)
houses1 <- houses1 %>% dplyr::select(-sqft_basement)

# Fresh split of the engineered data with SplitRatio = 0.75.
sample <- sample.split(houses1$price, SplitRatio = 0.75)
train <- subset(houses1, sample == TRUE)
test <- subset(houses1, sample == FALSE)

# Stack three bootstrap resamples of the training rows. The resample size
# follows the actual training set instead of the hard-coded 16884.
train <- rep_sample_n(train, size = nrow(train), reps = 3, replace = TRUE)

# The first column is dropped before fitting; presumably this is the
# `replicate` id added by rep_sample_n(), which must not act as a predictor.
model <- randomForest(price ~ ., train[, -1])

# NOTE(review): the out-of-bag figures printed for this forest are optimistic.
# Resampling with replacement duplicates training rows, so copies of the same
# house can sit both in a tree's bootstrap sample and in its out-of-bag set.
# The held-out test rows are unaffected, so use these test-set metrics (RMSE,
# R-squared, MAE) when comparing this model with the first one.
postResample(pred = predict(model, newdata = test), obs = test$price)
model
##
## Call:
## randomForest(formula = price ~ ., data = train[, -1])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 1594180771
## % Var explained: 98.92
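The out-of-bag variance explained rose by more than 10 percentage points, which looks like a very good improvement. It should be confirmed on the held-out test set, because out-of-bag estimates are optimistic whenever the training data contains resampled duplicate rows.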